# -*- coding: utf-8 -*-
"""
Created on Sat May 30 08:27:26 2020

@author: zh
"""
import pandas as pd
import jieba
# Load the raw corpus: a header-less CSV read as (category, content) columns.
df_news = pd.read_csv('data/tc-corpus.csv',names=['category','content'],encoding='utf-8')
# Rows with a missing category or content cannot be processed; drop them up front.
df_news = df_news.dropna()
print(df_news.head())
content = df_news.content.values.tolist()
# Segment every document into its token list with jieba.
segmented = [jieba.lcut(line) for line in content]
# Documents that yield at most one token (e.g. a bare line break) are skipped.
keep = [len(tokens) > 1 for tokens in segmented]
content_S = [tokens for tokens, kept in zip(segmented, keep) if kept]
# Remove the skipped rows from df_news as well, so the frame stays row-aligned
# with content_S; otherwise assigning a per-document column back onto df_news
# fails with a length mismatch. The copy avoids chained-assignment warnings
# when a column of the filtered frame is overwritten later.
df_news = df_news.loc[keep].copy()
df_content=pd.DataFrame({'content_S':content_S})
# One stopword per line; quoting=3 (csv.QUOTE_NONE) keeps quote characters literal.
stopwords=pd.read_csv("data/stopwords.txt",index_col=False,sep="\t",quoting=3,names=['stopword'], encoding='utf-8')
def drop_stopwords(contents,stopwords):
    """Remove stopwords and blank tokens from tokenised documents.

    Args:
        contents: Iterable of documents, each an iterable of string tokens.
        stopwords: Iterable of hashable tokens to discard.

    Returns:
        A list with one string per input document: the surviving tokens,
        in their original order, joined with '-'. A document whose tokens
        are all removed yields an empty string.
    """
    # Build the lookup once: set membership is O(1), whereas testing against
    # a list rescans every stopword for every token.
    stopword_set = set(stopwords)
    contents_clean = []
    for line in contents:
        # Keep a token only if it is not a stopword and is not pure whitespace.
        line_clean = [
            word for word in line
            if word not in stopword_set and str(word).strip()
        ]
        contents_clean.append('-'.join(line_clean))
    return contents_clean
        

# Pull the tokenised documents and the stopword column out as plain Python lists.
contents = df_content['content_S'].tolist()
stopwords = stopwords['stopword'].tolist()
# Filter the tokens, then overwrite the content column with the cleaned strings.
contents_clean = drop_stopwords(contents, stopwords)
df_news['content'] = contents_clean
print(df_news.head())
# Write the cleaned corpus without a header row (the index is still written).
df_news.to_csv(
    'data/corpus_clean.csv',
    header=False,
    encoding='utf-8',
)
